{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import glob\n",
    "import pandas as pd\n",
    "import pickle\n",
    "from collections import Counter\n",
    "import matplotlib.pyplot as plt\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.externals import joblib\n",
    "\n",
    "%config InlineBackend.figure_format = 'svg'\n",
    "%matplotlib inline\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def yield_origin_csv(file_type):\n",
    "    flag = 1\n",
    "    id_, api_name_list, call_pid_list, ret_value_list = [], [], [], []\n",
    "    api_name_regex = re.compile('<action api_name=\"(.*?)\" call_name')\n",
    "    call_pid_regex = re.compile('call_pid=\"(.*?)\" call_time=')\n",
    "    ret_value_regex = re.compile('ret_value=\"(.*?)\"')\n",
    "    for path in glob.glob(\"./stage1_dataset/train/{}/*\".format(file_type)):\n",
    "        with open(path, \"r\") as fp:\n",
    "            xml = fp.read()\n",
    "        api_names = re.findall(api_name_regex, xml)\n",
    "        call_pids = re.findall(call_pid_regex, xml)\n",
    "        ret_values = re.findall(ret_value_regex, xml)\n",
    "        \n",
    "        id_.append(path.split(\".\")[1].split(\"/\")[-1]) \n",
    "        api_name_list.append(\" \".join(api_names))\n",
    "        call_pid_list.append(\" \".join(call_pids))\n",
    "        ret_value_list.append(\" \".join(ret_values))\n",
    "        \n",
    "        \n",
    "        if flag % 300 == 0:\n",
    "            print(flag)\n",
    "        flag += 1\n",
    "    df = pd.DataFrame()\n",
    "    df[\"id\"] = id_\n",
    "    df[\"api_name\"] = api_name_list\n",
    "    df[\"call_pid\"] = call_pid_list\n",
    "    df[\"ret_value\"] = ret_value_list\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# black = yield_origin_csv(\"black\")\n",
    "# white = yield_origin_csv(\"white\")\n",
    "# origin_test = yield_origin_csv(\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "origin_data = pd.concat([white, black])\n",
    "origin_data[\"safe_type\"] = [0 for _ in range(20000)] + [1 for _ in range(10000)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# origin_data.to_csv(\"origin_data.csv\", encoding=\"utf-8\", index=False)\n",
    "# origin_test.to_csv(\"origin_test.csv\", encoding=\"utf-8\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "api_distinct_cnt: file调用了多少不同的API ;\n",
    "api_cnt: file调用api的总数；\n",
    "api_cnt_mean: file调用API的平均值；\n",
    "call_pid_distinct_cnt: file调用了多少不同的进程；\n",
    "call_pid_cnt_max,tid_api_cnt_min,tid_api_cnt_mean: \",\"file中的线程调用的 最多/最少/平均 api数目;\n",
    "\n",
    "value_equals0_cnt: file返回值为0的样本数;\n",
    "value_equals0_rate： file返回值为0的样本比率;\n",
    "value_distinct_cnt: file有多少不同的返回值;\n",
    "\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_value(x, kind=\"mean\"):\n",
    "    dict_ = Counter(x.split())\n",
    "    tmp = sorted(dict_.values())\n",
    "    if kind == \"mean\":\n",
    "        return sum(dict_.values()) / len(dict_)   \n",
    "    if kind == \"max\":\n",
    "        return tmp[-1]\n",
    "    if kind == \"min\":\n",
    "        return tmp[0]\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_features(data):\n",
    "    data[\"api_cnt\"] = data[\"api_name\"].apply(lambda x: len(x.split()))\n",
    "    data[\"api_distinct_cnt\"] = data[\"api_name\"].apply(lambda x: len(set(x.split())))\n",
    "    data[\"api_cnt_mean\"] = data[\"api_name\"].apply(lambda x: get_value(x))\n",
    "    data[\"call_pid_distinct_cnt\"] = data[\"call_pid\"].apply(lambda x: len(set(x.split())))\n",
    "    data[\"call_pid_cnt_mean\"] = data[\"call_pid\"].apply(lambda x: get_value(x))\n",
    "    data[\"call_pid_cnt_max\"] = data[\"call_pid\"].apply(lambda x: get_value(x, kind=\"max\"))\n",
    "    data[\"call_pid_cnt_min\"] = data[\"call_pid\"].apply(lambda x: get_value(x, kind=\"min\"))\n",
    "    data[\"ret_value_equals0_cnt\"] = data[\"ret_value\"].apply(lambda x: x.split().count('0'))\n",
    "    data[\"ret_value_equals0_rate\"] = data[\"ret_value\"].apply(lambda x: x.split().count('0') / (len(x.split())))\n",
    "    data[\"ret_value_distinct_cnt\"] = data[\"ret_value\"].apply(lambda x: len(set(x.split())))\n",
    "    data.drop([\"call_pid\", \"api_name\", \"ret_value\"], axis=1, inplace=True)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_features = make_features(origin_data)\n",
    "test_features = make_features(origin_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_features.to_csv(\"train_features.csv\", encoding=\"utf-8\", index=False)\n",
    "test_features.to_csv(\"test_features.csv\", encoding=\"utf-8\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## <center>Get n-gram features</center>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "origin_train_data = pd.read_csv(\"origin_data.csv\")\n",
    "origin_test_data = pd.read_csv(\"origin_test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data_api_name = origin_train_data[\"api_name\"]\n",
    "test_data_api_name = origin_test_data[\"api_name\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)\n",
    "train_tfidf_features = vectorizer.fit_transform(train_data_api_name.tolist())\n",
    "test_tfidf_features = vectorizer.transform(test_data_api_name.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"train_tfidf_features.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(train_tfidf_features, fp)\n",
    "with open(\"test_tfidf_features.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(test_tfidf_features, fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data_ret_value = origin_train_data[\"ret_value\"]\n",
    "test_data_ret_value = origin_test_data[\"ret_value\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9)\n",
    "train_tfidf_features = vectorizer.fit_transform(train_data_ret_value.tolist())\n",
    "test_tfidf_features = vectorizer.transform(test_data_ret_value.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"train_ret_value_tfidf_features.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(train_tfidf_features, fp)\n",
    "    \n",
    "with open(\"test_ret_value_tfidf_features.pkl\", \"wb\") as fp:\n",
    "    pickle.dump(test_tfidf_features, fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
